Mehrnoosh Hasanzade
# import a whole library as a new name
import pandas as pd
import spacy
import subprocess
import pysrt
# import package as its own name
import nltk
# sentence/word tokenizer models used by nltk below
nltk.download('punkt')
# download the small English spacy model, then load it as the shared pipeline
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import re
# remove_nois function
# remove_noise function
def remove_noise(text):
    """Strip subtitle markup and stray punctuation from one subtitle line.

    Removes <...> tags, {...} and [...] annotations, split-line dialogue
    dashes, and replaces '-', '_' and '&' with spaces, then trims the ends.

    BUG FIX: the original patterns used greedy quantifiers (e.g. "<.*>"),
    so a line like "<i>Hello</i>" lost everything between the first '<'
    and the last '>', including the dialogue itself. Non-greedy ".*?"
    removes only each bracketed span.
    """
    text = re.sub(r"<.*?>", " ", text)    # HTML-style tags, e.g. <i>...</i>
    text = re.sub(r"{.*?}", " ", text)    # {brace} annotations
    text = re.sub(r"\[.*?\]", " ", text)  # [bracket] annotations, e.g. [music]
    text = re.sub(r"- \n-", " ", text)    # dialogue dashes split across lines
    text = text.replace("-", " ")         # remaining hyphens
    text = text.replace("_", " ")         # underscores
    text = text.replace("&", " ")         # ampersands
    return text.strip()
# impurity function
# characters that signal leftover markup/noise after cleaning
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
# RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\"]')

def impurity(text, min_len=10):
    """Return the share of suspicious characters in *text*.

    Texts that are None or shorter than *min_len* characters score 0,
    since a ratio over a tiny denominator is not meaningful.
    Fixed the non-idiomatic `text == None` to `text is None`.
    """
    if text is None or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)
# rest of stuff
import textacy.preprocessing as tprep
def normalize(text):
    """Run textacy's standard normalization and replacement steps over *text*.

    Normalizes hyphenated words, quotation marks and unicode, removes
    accents, and replaces phone numbers, urls, emails, user handles and
    emojis with placeholder tokens.
    """
    steps = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.unicode,
        tprep.remove.accents,
        tprep.replace.phone_numbers,
        tprep.replace.urls,
        tprep.replace.emails,
        tprep.replace.user_handles,
        tprep.replace.emojis,
    )
    for step in steps:
        text = step(text)
    return text
# install pyspellchecker !!!
from spellchecker import SpellChecker
# spell checker instance used later to flag unknown tokens
spell = SpellChecker()
from itertools import chain
from collections import Counter
import textacy
import summa
from summa import keywords
from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint
from itertools import combinations
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
import networkx as nx
from matplotlib import pyplot as plt
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 34.8 MB/s eta 0:00:00
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.0)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.2)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.6.4)
Requirement already satisfied: jinja2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.2)
Requirement already satisfied: setuptools in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (56.0.0)
Requirement already satisfied: packaging>=20.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (23.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)
Requirement already satisfied: numpy>=1.15.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.24.3)
Requirement already satisfied: annotated-types>=0.4.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)
Requirement already satisfied: pydantic-core==2.16.3 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.16.3)
Requirement already satisfied: typing-extensions>=4.6.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.2.0)
Requirement already satisfied: idna<4,>=2.5 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2023.7.22)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/python/envs/minimal/lib/python3.8/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.3)
[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: pip install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
[nltk_data] Package punkt is already up-to-date!
Proposed Text
- Import the text into your report from your proposal.
- If the text is one big, long string, first break into sentence segments and store it in a Pandas DataFrame.
# read the subtitle file; pysrt yields one cue (timed text block) per entry
subs = pysrt.open("Interstellar.srt")
# build a one-column dataframe holding the raw text of every cue
DF = pd.DataFrame([{"Text": cue.text} for cue in subs])
DF.head()
# count the tokens in the raw subtitle dump
text = str(subs)
len(nltk.word_tokenize(text))
print(DF.shape)
Fix Errors
- Examine the text for errors or problems by looking at the text.
- Clean the data with examples from class (either impurity or example provided in NER section).
# strip subtitle markup/noise from every row
DF['clean2'] = DF['Text'].apply(remove_noise)
# drop rows that became empty after cleaning
DF = DF[DF['clean2'] != ""]
DF.head(10)
# score each row for leftover suspicious characters
DF['score'] = DF['clean2'].apply(impurity)
# NOTE(review): the sorted result is not assigned back — display only
DF.sort_values(by = ['score'], ascending = False)
# textacy-based normalization (quotes, accents, urls, emojis, ...)
DF['clean'] = DF['clean2'].apply(normalize)
DF.head(10)
# find all the unique tokens
# set() keeps unique items only
# nltk.word_tokenize breaks the text down into words
# " ".join combines all rows into one long text
# .to_list() converts the pandas Series to a plain list
clean_tokens = set(nltk.word_tokenize(" ".join(DF['clean'].to_list())))
# which tokens does the spell checker not recognize?
misspelled = spell.unknown(clean_tokens)
print(misspelled)
# for word in misspelled:
# # what's the word
# print(word)
# print("\n")
# # Get the one `most likely` answer
# print(spell.correction(word))
# # Get a list of `likely` options
# print(spell.candidates(word))
# # make a dictionary of the misspelled word and the correction
# # use find and replace in re to fix them
Pre-Processing
- Using spacy and textacy, pre-process the text to end up with a list of tokenized lists.
output = []
# run only the tagger and lemmatizer; tok2vec/ner/parser are not needed here
for doc in nlp.pipe(DF['clean'].tolist(), disable=["tok2vec", "ner", "parser"]):
    words = textacy.extract.words(
        doc,
        filter_stops=True,   # default True, no stopwords
        filter_punct=True,   # default True, no punctuation
        filter_nums=True,    # default False, no numbers
        include_pos=None,    # default None = include all
        exclude_pos=None,    # default None = exclude none
        min_freq=1,          # minimum frequency of words
    )
    output.append([str(w) for w in words])
# drop rows where every token was filtered out
output = [tokens for tokens in output if tokens]
output[10]
Create a frequency table of each of the tokens returned in this output. Below is some example code to get us started.
# sanity-check the nesting: output is a list of lists of strings
# all items
print(type(output))
# first list
print(type(output[100]))
# first list, first item (this is the issue!)
print(type(output[100][0]))
<class 'list'>
<class 'str'>
# flatten the list-of-lists and count token frequencies
word_counts = Counter(chain.from_iterable(output))
# Counter already knows how to sort itself: most_common() returns the
# (word, count) pairs in descending count order — same result as the
# previous hand-rolled sorted(..., key=..., reverse=True)
sorted_word_counts = word_counts.most_common()
# Convert the sorted list back to a Counter object for presentation
# (a Counter's repr also prints most-common-first)
sorted_word_counts_dict = Counter(dict(sorted_word_counts))
print(sorted_word_counts_dict)
Processing Text Summary
- Write a paragraph explaining the process of cleaning data for a your NLP pipeline.
- You should explain the errors you found in the dataset and how you fixed them.
- Why did you think these things were important to fix for this project?
-
So first I imported the subtitles of the "Interstellar" movie and created a dataframe from the text.
-
To get the total number of tokens, I created one long string and then tokenized it using "nltk" to see how many tokens it has.
-
Next, I cleaned the data, first using the "remove_noise" function and then the "impurity" function that we defined using "regex" to find suspicious patterns in the text and remove them to obtain a cleaner text. The impurity score is zero for all rows, showing that the text is clean.
-
Then, we run the normalize function using "textacy" that we defined to take care of quotation marks Unicodes, … , and to remove phone numbers, urls, emails, emojis and etc.
-
Next, we can check for misspelled words. If our text is too long, we had better not do that, especially when the text contains many proper nouns. Here, after checking for misspelled words, we found that most of them are not actually misspelled: they are names of people in the movie that were written in lowercase, or nicknames. So we did not change anything here.
-
I did not find any error
-
Used spacy and textacy to preprocess the data and end up with a list of tokenized lists, cleaning the data as explained above.
-
The frequency table shows that the main character is "Murph Cooper". This movie is about "time", "Tars", "Earth", and "Space".
Part of Speech Tagging
- Tag your data with spacy’s part of speech tagger.
- Convert this data into a Pandas DataFrame.
- Use the dataframe to calculate the most common parts of speech.
- What is the most common part of speech?
- Do you see words that are multiple parts of speech?
- What can you learn about the text from examining the most common nouns and verbs?
# clean_tokens = set(nltk.sent_tokenize(" ".join(DF['clean'].to_list())))
# one long string of the cleaned subtitles
subtitle = " ".join(DF['clean'].to_list())
text = str(subs)
# rebuild DF as one sentence per row (NOTE: this overwrites the earlier DF)
DF = pd.DataFrame(
    # nltk.sent_tokenize(text),
    nltk.sent_tokenize(subtitle),
    columns = ["sentences"]
)
DF.head(20)
# easier to loop over the big text file than loop over words AND rows in pandas
spacy_pos_tagged = [(str(word), word.tag_, word.pos_) for word in nlp(subtitle)]
# each row represents one token: surface text, fine-grained tag, universal POS
DF_POS = pd.DataFrame(
    spacy_pos_tagged,
    columns = ["token", "specific_tag", "upos"]
)
DF_POS.head(20)
- Use the dataframe to calculate the most common parts of speech.
# frequency of each universal POS tag
DF_POS['upos'].value_counts()
# punct is not a real part of speech, so the most common are nouns and verbs
# X are things the tagger could not figure out
DF_POS2 = pd.crosstab(DF_POS['token'], DF_POS['upos'])
# convert counts to True/False, then sum to count how many tags each token got
DF_POS2['total'] = DF_POS2.astype(bool).sum(axis=1)
# print out the tokens that were tagged with more than one part of speech
DF_POS2[DF_POS2['total'] > 1]
# What are the 10 most common nouns and verbs?
nouns_df = DF_POS.loc[DF_POS['upos'].eq('NOUN')]
distinct_noun_count = nouns_df['token'].nunique()
top_10_nouns = nouns_df['token'].value_counts().head(10)
print('10 most common nouns are:\n', top_10_nouns)

verbs_df = DF_POS.loc[DF_POS['upos'].eq('VERB')]
distinct_verbs_count = verbs_df['token'].nunique()
top_10_verbs = verbs_df['token'].value_counts().head(10)
print('10 most common verbs are:\n', top_10_verbs)
time 49
years 35
planet 24
people 24
gravity 20
world 20
way 18
percent 17
life 16
data 16
Name: token, dtype: int64
10 most common verbs are:
have 67
go 51
know 48
get 39
do 33
got 33
gon 29
's 26
need 25
save 24
Name: token, dtype: int64
-
What is the most common part of speech?
-
The most common parts of speech are: PUNCT, PRON, VERB and NOUN.
-
Do you see words that are multiple parts of speech?
-
There are 220 tokens that are multiple part of speech. (using crosstabs)
-
What can you learn about the text from examining the most common nouns and verbs?
-
I looked at the 10 most common nouns and verbs in the text.
-
The nouns show exactly what the movie is about. They include time/years, which refers to the difference in time on different planets — the topic of the movie — as well as gravity, world, life and hole, which I think refers to the black hole.
-
So the nouns give me a sense of what the text is about. Looking at the 10 most common verbs, they are not anything special about the topic, other than 'go', which refers to going to another planet or leaving Earth.
-
it contains 's which was 'is'
-
Some verbs are in the past tense; with lemmatization we could return them to the base present tense, so they would not be reported as two different verbs, like 'get' and 'got'.
-And it seems that it did a good job in finding verbs and nouns.
KPE
Use textacy to find the key phrases in your text.
- in the r window for r people
- library(reticulate)
- py_install("networkx < 3.0", pip = T)
# textacy KPE
# build an english language pipeline for textacy
# BUG FIX: disable must be an iterable of component names; ("parser") is
# just the string "parser" (parentheses without a comma make no tuple) —
# use ("parser",) so the parser component is what gets disabled.
en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
# build a processor for textacy using spacy and process text
doc = textacy.make_spacy_doc(subtitle, lang = en)
# TextRank keyterm extraction: top 5 lemma-normalized key phrases
print([kps for kps, weights in textacy.extract.keyterms.textrank(doc, normalize = "lemma", topn = 5)])
# collapse near-duplicate key terms into variant groups
terms = set([term for term, weight in textacy.extract.keyterms.textrank(doc)])
print(textacy.extract.utils.aggregate_term_variants(terms))
[{'good night Brand'}, {'real good time'}, {'couple year'}, {'thing right'}, {'space time'}, {'wrong time'}, {'good Murph'}, {'good thing'}, {'good idea'}, {'year old'}]
# summa's TextRank keyword extraction, with scores, for comparison
TR_keywords = keywords.keywords(subtitle, scores = True)
print(TR_keywords[0:20])
- **Using textacy utilities, combine like key phrases.
- What did you learn about your text by using keyphrase analysis?**
It primarily returned phrases related to time and space-time, which makes sense since one of the main topics of the movie is space-time. However, it also returned some meaningless phrases. I ran Summa as well because it didn't take much time. Using Summa, it identified the names of the main characters "Cooper," "Murph," and "Brand," as well as "years," but it also returned some unimportant or meaningless phrases.
NER + Snorkel
- Use spacy to extract named entities.
- Create a summary of your named entities.
- Apply Snorkel to your data to show any relationship between names.
- What kinds of relationships did you explore? Did you find any?
# easier to loop over the big text file than loop over words AND rows in pandas
spacy_ner_tagged = [(str(word.text), word.label_) for word in nlp(subtitle).ents]
# each row represents one entity mention: surface text + entity label
DF_NER = pd.DataFrame(
    spacy_ner_tagged,
    columns = ["token", "entity"]
)
print(DF_NER['entity'].value_counts())
# token-by-entity-type contingency table
DF_NER2 = pd.crosstab(DF_NER['token'], DF_NER['entity'])
print(DF_NER2)
# convert to true false to add up how many entity types each token received
DF_NER2['total'] = DF_NER2.astype(bool).sum(axis=1)
# print out the tokens labeled with more than one entity type
DF_NER2[DF_NER2['total'] > 1]
CARDINAL 85
ORG 76
DATE 69
LOC 24
TIME 18
ORDINAL 17
PERCENT 16
PRODUCT 8
GPE 8
QUANTITY 8
WORK_OF_ART 6
NORP 5
EVENT 1
FAC 1
Name: entity, dtype: int64
entity CARDINAL DATE EVENT FAC GPE LOC NORP ORDINAL ORG \
token
1 2 0 0 0 0 0 0 0 0
10 1 0 0 0 0 0 0 0 0
10 o'clock 0 0 0 0 0 0 0 0 0
10 percent 0 0 0 0 0 0 0 0 0
10 year old 0 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ...
twelve Ranger 0 0 0 0 0 0 0 0 0
two 12 0 0 0 0 0 0 0 0
two years 0 1 0 0 0 0 0 0 0
years 0 8 0 0 0 0 0 0 0
zero 1 0 0 0 0 0 0 0 0
entity PERCENT PERSON PRODUCT QUANTITY TIME WORK_OF_ART
token
1 0 0 0 0 0 0
10 0 0 0 0 0 0
10 o'clock 0 0 0 0 1 0
10 percent 1 0 0 0 0 0
10 year old 0 0 0 0 0 0
... ... ... ... ... ... ...
twelve Ranger 0 1 0 0 0 0
two 0 0 0 0 0 0
two years 0 0 0 0 0 0
years 0 0 0 0 0 0
zero 0 0 0 0 0 0
[230 rows x 15 columns]
09:47 min
Apply Snorkel to your data to show any relationship between names.
get the data into a good format
we use this code to store all the entities
stored_entities = []

# first get the entities, must be two for relationship matches
def get_entities(x):
    """
    Grabs the names using spacy's entity labeler.

    For every pair of entities found in sentence `x`, appends
    [sentence, tokens, name1, name2, (start, stop) ids 1, (start, stop) ids 2]
    to the module-level `stored_entities` list. Returns None.
    """
    # get all the entities in this row
    processed = nlp(x)
    # get the tokens for each sentence
    tokens = [word.text for word in processed]
    # get all the entities - NOTE(review): the `!= ""` filter keeps every
    # entity type, not only persons — confirm that is intended
    temp = [(str(ent), ent.label_) for ent in processed.ents if ent.label_ != ""]
    # only move on if this row has at least two
    if len(temp) > 1:
        # finds all the combinations of pairs
        temp2 = list(combinations(temp, 2))
        # for each pair combination
        for (person1, person2) in temp2:
            # find the names in the person 1
            person1_words = [word.text for word in nlp(person1[0])]
            # find the token numbers for person 1
            # NOTE(review): matching by token text means a repeated word
            # elsewhere in the sentence can contribute ids outside the
            # actual mention — confirm this is acceptable
            person1_ids = [i for i, val in enumerate(tokens) if val in person1_words]
            # output in (start, stop) token tuple format
            if len(person1_words) > 1:
                person1_ids2 = tuple(idx for idx in person1_ids[0:2])
            else:
                id_1 = [idx for idx in person1_ids]
                # NOTE(review): raises IndexError if the entity text was not
                # found among the tokens (id_1 empty) — confirm inputs
                person1_ids2 = (id_1[0], id_1[0])
            # do the same thing with person 2
            person2_words = [word.text for word in nlp(person2[0])]
            person2_ids = [i for i, val in enumerate(tokens) if val in person2_words[0:2]]
            if len(person2_words) > 1:
                person2_ids2 = tuple(idx for idx in person2_ids[0:2])
            else:
                id_2 = [idx for idx in person2_ids[0:2]]
                person2_ids2 = (id_2[0], id_2[0])
            # store all this in a list
            stored_entities.append(
                [x, # original text
                 tokens, # tokens
                 person1[0], # person 1 name
                 person2[0], # person 2 name
                 person1_ids2, # person 1 id token tuple
                 person2_ids2 # person 2 id token tuple
                 ])
# run the extractor on every sentence (fills stored_entities as a side effect)
DF['sentences'].apply(get_entities)
# create dataframe in snorkel candidate structure
DF_dev = pd.DataFrame(stored_entities, columns = ["sentence", "tokens", "person1", "person2", "person1_word_idx", "person2_word_idx"])
DF_dev
This dataframe has the sentences and their tokens, the first two entities (person 1 and person 2), and the tuples that give the start and end token positions of each particular phrase.
some of these dependencies are meaningless or useless
# live locate home road roads in at street (locations tied together)
# family terms for people
# get words between the data points
# get words between the data points
@preprocessor()
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Attach the tokens strictly between the two mentions as `between_tokens`.
    """
    # slice starts right after mention 1 ends and stops where mention 2 begins
    lo = cand.person1_word_idx[1] + 1
    hi = cand.person2_word_idx[0]
    cand.between_tokens = cand.tokens[lo:hi]
    return cand
# get words next to the data points
@preprocessor()
def get_left_tokens(cand: DataPoint) -> DataPoint:
    """
    Attaches the tokens in a window to the left of each person mention
    (as `person1_left_tokens` / `person2_left_tokens`).
    """
    # TODO: need to pass window as input params
    # NOTE(review): window is 5 here although an earlier comment said 3 —
    # confirm the intended window size
    window = 5
    end = cand.person1_word_idx[0]
    # NOTE(review): the trailing `-1` excludes the token immediately to the
    # left of the mention; if the adjacent word should count, the slice
    # would be [-window:] — confirm intent before changing
    cand.person1_left_tokens = cand.tokens[0:end][-1 - window : -1]
    end = cand.person2_word_idx[0]
    cand.person2_left_tokens = cand.tokens[0:end][-1 - window : -1]
    return cand
I have defined different tags to find the meaningful relationships between them. Not all of them were meaningful.
# entity lexicons used as labeling-function resources below
location = {'Earth', 'Ireland', 'Wormhole', 'hole', 'Saturn', 'Mars'}
people = {'Grandpa', 'Tom', 'Murph', 'Hanley', 'Apollo', 'Miller', 'Cooper', 'Mom', 'Newton',
          'Amelia', 'Nelson', 'Morse', 'Brand', 'Doyle', 'Ranger', 'Lazarus'}
# BUG FIX: a missing comma after 'four months' silently concatenated it with
# 'next year' into the single element 'four monthsnext year' (implicit string
# concatenation), so neither phrase could ever match.
time = {'4:00 today', 'tomorrow', 'every day', '40 years', 'a couple hours', 'every minute',
        'Twenty three years', 'four months', 'next year', '30 years', 'Eight months',
        'Fourteen months', 'seven years', 'night', 'seven years ago'}
cardinal = {'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            '10', '40', '80', '30', '90', '67', '72', '50', '68', '95'}
# live locate home road roads in at street (locations tied together)
# family terms for people
found_location = 1  # label emitted when a "location" LF finds evidence
found_family = 1    # label emitted when a "family" LF finds evidence
ABSTAIN = 0         # label emitted when an LF has no evidence
# repurpose the cardinal-number lexicon as the "location" resource below
location = cardinal
# location = {"live", "living", "locate", "located", "home", "road", "roads", "street", "streets", "in", "at", "of"}
@labeling_function(resources=dict(location=location), pre=[get_text_between])
def between_location(x, location):
    """Label found_location if any lexicon word occurs between the mentions."""
    if location.intersection(set(x.between_tokens)):
        return found_location
    return ABSTAIN

@labeling_function(resources=dict(location=location), pre=[get_left_tokens])
def left_location(x, location):
    """Label found_location if a lexicon word sits in either left window."""
    nearby = set(x.person1_left_tokens) | set(x.person2_left_tokens)
    return found_location if set(location) & nearby else ABSTAIN
# the character-name lexicon stands in for the classic family-terms resource
family = people
# family = {"spouse", "wife", "husband", "ex-wife", "ex-husband", "marry",
# "married", "father", "mother", "sister", "brother", "son", "daughter",
# "grandfather", "grandmother", "uncle", "aunt", "cousin",
# "boyfriend", "girlfriend"}
@labeling_function(resources=dict(family=family), pre=[get_text_between])
def between_family(x, family):
    """Label found_family if any person name occurs between the mentions."""
    if family.intersection(set(x.between_tokens)):
        return found_family
    return ABSTAIN

@labeling_function(resources=dict(family=family), pre=[get_left_tokens])
def left_family(x, family):
    """Label found_family if a person name sits in either left window."""
    nearby = set(x.person1_left_tokens) | set(x.person2_left_tokens)
    return found_family if set(family) & nearby else ABSTAIN
# create a list of labeling functions to run
lfs = [
    between_location,
    left_location,
    between_family,
    left_family
]
# build the applier function
applier = PandasLFApplier(lfs)
# run it on the dataset; one label column per labeling function
L_dev = applier.apply(DF_dev)
L_dev
# attach the four label columns to the candidate dataframe
DF_combined = pd.concat([DF_dev, pd.DataFrame(L_dev, columns = ["location1", "location2", "family1", "family2"])], axis = 1)
DF_combined.head()
# sum the two LF outputs: 0 = both abstained, 1 = one fired, 2 = both fired
DF_combined['location_yes'] = DF_combined['location1'] + DF_combined["location2"]
DF_combined['family_yes'] = DF_combined['family1'] + DF_combined["family2"]
print(DF_combined['location_yes'].value_counts())
print(DF_combined['family_yes'].value_counts())
1 22
2 11
Name: location_yes, dtype: int64
0 99
1 22
2 1
Name: family_yes, dtype: int64
What might you do to improve the default NER extraction?
- We had entities like "Murph", "Cooper" and "Cooper Murph" that referred to the same person, and many other entities that refer to the same thing but in different words. We should do Named Entity Disambiguation (NED): after getting the names/keyphrases, we have to figure out what those things refer to in the real world by putting NER and NED together and doing Named Entity Linking (NEL).
- We did POS tagging, NER and KPE. WE need parsing. and coreference solution to put together all the same entities. and link a database of definitions of people/places/time/ dates/things.
Knowledge Graphs
Slides Version
- Based on the chosen text, add entities to a default spacy model.
- Add a norm_entity, merge_entity, and init_coref pipelines.
- Update and add the alias lookup if necessary for the data.
- Add the name resolver pipeline.
Or Use Your Snorkel Output
- Create a co-occurrence graph of the entities linked together in your text.
# locations only: keep candidate pairs where at least one location LF fired
DF_loc = DF_combined[DF_combined['location_yes'] > 0]
DF_loc = DF_loc[['person1', 'person2']].reset_index(drop = True)
# co-occurrence counts per (person1, person2) pair
cooc_loc = DF_loc.groupby(by=["person1", "person2"], as_index=False).size()
# family only
DF_fam = DF_combined[DF_combined['family_yes'] > 0]
DF_fam = DF_fam[['person1', 'person2']].reset_index(drop = True)
cooc_fam = DF_fam.groupby(by=["person1", "person2"], as_index=False).size()
# take out issues where entity 1 == entity 2 (self-loops)
cooc_loc = cooc_loc[cooc_loc['person1'] != cooc_loc['person2']]
cooc_fam = cooc_fam[cooc_fam['person1'] != cooc_fam['person2']]
print(cooc_loc.head())
print(cooc_fam.head())
0 67 68 1
1 80 percent Earth 1
2 Deactivate three 1
3 Deactivate two 1
4 T minus 10, nine 60 percent 1
person1 person2 size
0 4:00 today 101 1
1 Amelia 90 percent 1
2 Brand two 1
3 Cooper Gargantua 1
4 Cooper TARS 1
# start by plotting the whole thing for location
# keep only pairs that co-occur more than once
cooc_loc_small = cooc_loc[cooc_loc['size']>1]
graph = nx.from_pandas_edgelist(
    cooc_loc_small[['person1', 'person2', 'size']] \
        .rename(columns={'size': 'weight'}),
    source='person1', target='person2', edge_attr=True)
# layout that treats edge weight as attraction strength
pos = nx.kamada_kawai_layout(graph, weight='weight')
_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
        node_size=1000,
        node_color='skyblue',
        alpha=0.8,
        with_labels = True)
plt.title('Graph Visualization', size=15)
# redraw each edge with width proportional to its co-occurrence count
for (node1,node2,data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(graph,pos,
                               edgelist=[(node1, node2)],
                               width=width,
                               edge_color='#505050',
                               alpha=0.5)
plt.show()
plt.close()
This graph is meaningless.
# display the family co-occurrence table, then plot it the same way
cooc_fam
graph = nx.from_pandas_edgelist(
    cooc_fam[['person1', 'person2', 'size']] \
        .rename(columns={'size': 'weight'}),
    source='person1', target='person2', edge_attr=True)
# layout that treats edge weight as attraction strength
pos = nx.kamada_kawai_layout(graph, weight='weight')
_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
        node_size=1000,
        node_color='skyblue',
        alpha=0.8,
        with_labels = True)
plt.title('Graph Visualization', size=15)
# redraw each edge with width proportional to its co-occurrence count
for (node1,node2,data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(graph,pos,
                               edgelist=[(node1, node2)],
                               width=width,
                               edge_color='#505050',
                               alpha=0.5)
plt.show()
plt.close()